from idlelib.iomenu import encoding

from docx import Document
import re

def is_number(value):
    """Return True if ``str(value)`` is written as a plain decimal number.

    Accepted forms are an optional sign, then digits with an optional
    fractional part (``12``, ``12.``, ``12.5``, ``.5``), then an optional
    exponent (``e10``, ``E-3``).  Anything else, including the empty string
    and text with surrounding whitespace, is rejected.

    Args:
        value: Any object; it is converted with ``str()`` before matching.

    Returns:
        bool: Whether the whole string form of *value* matches the pattern.
    """
    pattern = r'[-+]?(\d+|\d+\.\d*|\.\d+)([eE][-+]?\d+)?'
    # fullmatch rather than match with a "$" anchor: "$" also matches just
    # before a trailing newline, so "12\n" used to be reported as a number.
    return re.fullmatch(pattern, str(value)) is not None

def handle_text(text):
    """Return *text* with every tab, newline and space character removed."""
    # A translation table that maps a code point to None deletes that
    # character, so all three are dropped in a single pass.
    dropped = dict.fromkeys(map(ord, "\t\n "))
    return text.translate(dropped)

def extract_chinese_characters(doc_path):
    """Collect the text of a Word document as a list of output lines.

    Despite the name, no filtering by character set is performed.  Each
    paragraph's text is passed through ``handle_text``; paragraphs that come
    back empty, or that ``is_number`` reports as a number, are skipped.
    After all paragraphs, every table contributes a marker line followed by
    one entry per cell (cells are kept even when empty or numeric).

    Args:
        doc_path: Path of the .docx file, passed to ``Document``.

    Returns:
        list[str]: The collected entries, each terminated by a newline.
        Paragraph entries come first, then the entries of each table.
    """
    # Load the Word document.
    doc = Document(doc_path)
    chinese_chars = []

    # Walk the document's paragraphs.
    for para in doc.paragraphs:
        text = handle_text(para.text)
        if text == '' or is_number(text):
            continue
        chinese_chars.append(text + "\n")

    for table in doc.tables:
        # End the marker with a newline like every other entry; without it
        # the first cell's text is written onto the marker line.
        chinese_chars.append('**********************表格**********************' + "\n")
        for row in table.rows:
            for cell in row.cells:
                text = handle_text(cell.text)
                chinese_chars.append(text + "\n")

    return chinese_chars


# Base name shared by the input Word document and the output text file.
name = '基本国情20250507'
doc_path = name + '.docx'
chinese_chars = extract_chinese_characters(doc_path)
# Every entry already carries its own line ending, so they are joined as-is.
with open(name + '.txt', mode='w', encoding='utf-8') as file:
    file.write(''.join(chinese_chars))